From the World Health Organization: On 31 December 2019, WHO was alerted to several cases of pneumonia in Wuhan City, Hubei Province, China. Since the beginning of the coronavirus pandemic, WHO and the Our World in Data team have been collecting daily data on the number of COVID-19 cases and deaths, based on reports from health authorities worldwide. To ensure the accuracy and reliability of the data, this process is constantly being refined. This helps to monitor and interpret the dynamics of the COVID-19 pandemic not only in the European Union (EU) and the European Economic Area (EEA), but also worldwide.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
For this project we are using the dataset provided by https://ourworldindata.org/coronavirus-source-data. Here we will visualize the current trend of cases in Asia, and especially in Nepal. So, let's begin the project by importing the dataset.
# Read the Our World in Data COVID-19 export from the working directory.
covid_data = pd.read_csv(
    'owid-covid-data.csv',
    sep=',',
)
# Peek at the first rows to confirm the file parsed as expected.
covid_data.head()
# Animated world map of daily new cases, one animation frame per date.
#
# Keep only rows that report at least one new case, then total them per
# (date, location).  Only `new_cases` is aggregated: the map reads nothing
# else, and summing every column would also try to "sum" text columns such
# as `continent`, which newer pandas versions no longer drop silently.
covid_data_countrydate = covid_data[covid_data['new_cases'] > 0]
covid_data_countrydate = (
    covid_data_countrydate
    .groupby(['date', 'location'])['new_cases']
    .sum()
    .reset_index()
)
fig = px.choropleth(
    covid_data_countrydate,
    locations="location",
    locationmode="country names",
    color="new_cases",
    hover_name="location",
    animation_frame="date",
)
# Centre the title and hide the map frame and coastlines.
fig.update_layout(
    title_text='Spread of Coronavirus',
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
    ),
)
fig.show()
Here, we visualize the coronavirus data for Asia only, so we select the rows belonging to the Asian continent.
# Restrict the full dataset to rows whose continent is Asia.
is_asia = covid_data['continent'] == 'Asia'
covid_data1 = covid_data.loc[is_asia]
covid_data1
# Collapse to one row per (location, date), order newest date first, and
# keep the first (i.e. newest) row of every location.
per_location_day = covid_data1.groupby(['location', 'date']).max().reset_index()
newest_first = per_location_day.sort_values('date', ascending=False)
covid = newest_first.drop_duplicates(subset=['location'])
# Drop locations whose newest row reports no cases at all.
covid = covid[covid['total_cases'] > 0]
covid.head()
# Choropleth of the `new_cases` value held in `covid` for each location.
asia_trace = go.Choropleth(
    locations=covid['location'],
    locationmode='country names',
    z=covid['new_cases'],
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
fig = go.Figure(data=asia_trace)
# Centre the title; hide the frame and coastlines on a flat projection.
fig.update_layout(
    title_text='New cases As of October 13 : Asia',
    title_x=0.5,
    geo={
        'showframe': False,
        'showcoastlines': False,
        'projection_type': 'equirectangular',
    },
)
# Same map as for all of Asia, but with India's row removed before plotting.
not_india = covid['location'] != 'India'
df_no_india = covid[not_india]
no_india_trace = go.Choropleth(
    locations=df_no_india['location'],
    locationmode='country names',
    z=df_no_india['new_cases'],
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
fig = go.Figure(data=no_india_trace)
# Centre the title; hide the frame and coastlines on a flat projection.
fig.update_layout(
    title_text='New cases As of October 13 : Asia(Excluding India)',
    title_x=0.5,
    geo={
        'showframe': False,
        'showcoastlines': False,
        'projection_type': 'equirectangular',
    },
)
The dataset above has lots of columns (feature labels). We aren't using all of them in our project, so we will select only the required columns. To be clear up front: we will be using new cases, total cases, new deaths and total deaths from Asia and Nepal.
# Narrow the Asia subset down to the columns this analysis works with.
wanted_columns = [
    'continent', 'location', 'date',
    'new_tests', 'new_cases', 'new_deaths',
    'total_cases', 'total_deaths',
]
df = covid_data1[wanted_columns]
df
There might be some missing (NaN) values, so we will replace them with zero. We could have imputed the mean instead, but since this is our first project on SageMaker, I will keep it simple.
# fillna returns a new frame instead of modifying df, so the result has to
# be assigned back for the zeros to be kept.
df = df.fillna(0)
df
# Bar chart of Nepal's new cases per date, ordered by ascending date.
df11 = covid_data.loc[covid_data['location'] == 'Nepal']
new_cases_by_date = df11.groupby(['date'])['new_cases'].sum()
bar_data = new_cases_by_date.reset_index().sort_values('date', ascending=True)
fig = px.bar(
    bar_data,
    x="date",
    y="new_cases",
    text='new_cases',
    orientation='v',
    height=600,
    title='Confirmed Cases In Nepal Till October 13',
)
fig.show()
def plot_var(var='new_deaths',
             location='Nepal',
             days=30):
    """
    Plots a bar chart of the given variable over the date range

    var      -- name of the column of the module-level ``df`` to plot
    location -- value of the ``location`` column selecting the rows to plot
    days     -- how many trailing rows of that selection to show; must be
                a positive integer (default 30)

    Raises AssertionError when ``var`` or ``location`` is not a string.
    """
    assert isinstance(var, str), "Expected string as the variable name"
    assert isinstance(location, str), "Expected string as the location name"
    # Select the location's rows once and reuse them for both axes, so the
    # x and y values are guaranteed to come from the same rows.
    # NOTE(review): "last N days" assumes one row per date, in date order.
    recent = df[df['location'] == location][-days:]
    plt.figure(figsize=(12, 4))
    plt.title("{} for {} In last {} Days".format(var, location, days), fontsize=18)
    plt.bar(x=recent['date'], height=recent[var], edgecolor='k', color='orange')
    plt.grid(True)
    plt.xticks(fontsize=14, rotation=45)
    plt.yticks(fontsize=14)
    plt.show()
# Draw the chart for both daily metrics; the location is left at its default.
for metric in ('new_cases', 'new_deaths'):
    plot_var(metric)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Start again from the raw file and keep only Nepal's rows and the columns
# used for the trend plot and the regression.
covid_data = pd.read_csv('owid-covid-data.csv', sep=',')
df = covid_data.loc[covid_data['location'] == 'Nepal']
df = df[['continent', 'location', 'date', 'new_cases', 'new_deaths', 'total_cases', 'total_deaths']]
# fillna returns a new frame instead of modifying df, so the result has to
# be assigned back for the zeros to be kept.
df = df.fillna(0)
df
# Sum total_cases per date and order the result newest date first.
covidByDay = df.groupby(['date'])[['total_cases']].sum()
covidByDay = covidByDay.sort_values('date', ascending=False)
covidByDay.head()
# The dates (the frame's index) are reused as tick-label text.
labels = covidByDay.index.get_level_values(0).values
plt.figure(figsize=(24, 6))
ax = sns.lineplot(data=covidByDay, palette="tab10", linewidth=2.5)
ax.set_xticklabels(labels, rotation=70, horizontalalignment='right')
ax.set_ylabel('Total Cases')
ax.set_title('Cases of COVID-19 In Nepal')
ax.margins(0)
n = 7  # Keeps every 7th label
# Hide every tick label whose position is not a multiple of n.
for position, tick_label in enumerate(ax.xaxis.get_ticklabels()):
    if position % n != 0:
        tick_label.set_visible(False)
ax
Since the number of cases for the first few days is very small, there is a large fluctuation in the early part of the graph, after which it stabilizes to a nearly straight line. Hence, we choose to ignore the low-valued data in order to develop a model with a better fit.
# only work with a limited amount of data
df = df[df['total_cases'] > 10000]
df
# Number the remaining rows 1..len(df).  The length is taken from the frame
# itself: a fixed count only matches one particular snapshot of the CSV and
# makes insert() fail with a length mismatch on any other.
ar = list(range(1, len(df) + 1))
df.insert(0, "SN", ar, True)
df
# Turn the SN and total_cases columns into (n_rows, 1) column arrays:
# x1 is the regression input, y the regression target.
x1 = np.array(df["SN"])[:, np.newaxis]
y = np.array(df['total_cases'])[:, np.newaxis]
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# Banner printed above the per-degree scores.
print('--'*15, end='')
print('polynomial model training', end='')
print('--'*10)
# Fit one polynomial regression for each degree 1..5 and print its score
# (labelled R2 in the output) on the same data it was fitted on.
for i in range(1, 6):
    polyfet = PolynomialFeatures(degree=i)
    xa = polyfet.fit_transform(x1)
    model = linear_model.LinearRegression()
    model.fit(xa, y)
    accuracy = model.score(xa, y)
    score_percent = round(accuracy*100, 3)
    print('accuracy(R2) with degree_{} is --> {}%'.format(i, score_percent))
print('--'*45)
# Refit with one chosen degree and compare the fitted curve with the data.
polyfet = PolynomialFeatures(degree=4)  # you can change degree
xa = polyfet.fit_transform(x1)
model = linear_model.LinearRegression()
model.fit(xa, y)
yp = model.predict(xa)
# Observed values, deliberately left 1-D (no reshape) for plotting.
yact = np.array(df['total_cases'])
plt.figure(figsize=(8, 6))
# Fitted values as a dashed blue line, observed values as a solid green one.
for series, line_style in ((yp, "--b"), (yact, "-g")):
    plt.plot(series, line_style)
plt.legend(['pred', 'actual'])
plt.xticks()
plt.title("comparing actual and pred", fontdict=None, loc='center')
plt.show()
# Offsets 0..29 added to the last SN value; offset 0 is the last observed
# row itself, so the extrapolated curve starts where the data ends.
x_fut = np.arange(30).reshape(-1, 1)
xf = x1[-1:] + x_fut
future_features = polyfet.transform(xf)
y_fut = model.predict(future_features).astype(int)
plt.figure(figsize=(16, 10))
# Fitted curve, observed values, and the extrapolated values in red.
plt.plot(x1, yp, "--b")
plt.plot(x1, yact, "-g")
plt.plot(xf, y_fut, "--r")
plt.legend(['predicted', 'actual', "future_pred"])
plt.xticks()
plt.title("comparing actual and pred", fontdict=None, loc='center')
plt.show()
# Predicted total cases 7 and 30 rows past the last observed one.
for days in (7, 30):
    # x1[-1:] keeps its (1, 1) shape, so adding the offset already yields
    # the 2-D input that transform() expects.
    target_day = x1[-1:] + days
    predicted = model.predict(polyfet.transform(target_day))
    print("Corona Cases after {} day - ".format(days), end='')
    # Pull the single element out explicitly: calling int() on a whole
    # array is deprecated in recent NumPy releases.
    print(int(predicted[0, 0]))